Abstract
This project is about applying simple linear regression (SLR) to real data using R. I am using a dataset that records the quality of red wine alongside various physicochemical properties. The data for this project was found in the UCI Machine Learning Repository. I will be using this dataset and what I have learned this semester to analyze the data. The goal is to use simple linear regression to find the correlation between these properties and the overall taste quality of the red wine. I aim to find whether or not there is a correlation between these properties and how we as humans rate the wine's quality.
#<center>
#{ width=20% }
#</center>
#<video width="320" height="240" controls>
# <source src="usingvideoinrmd.mp4" type="video/mp4">
#Your browser does not support the video tag.
#</video>
library(s20x)
# Load the red-wine quality data set from the working directory.
wine <- read.csv("winequality-red.csv")

# Scatter plot of quality against volatile acidity with a smoothed trend
# (f = 0.3 is the smoother setting handed to s20x::trendscatter).
trendscatter(wine$quality ~ wine$volatile.acidity, f = 0.3, data = wine)

# Simple linear regression of quality on volatile acidity.
wine.fit <- lm(wine$quality ~ wine$volatile.acidity, data = wine)
summary(wine.fit)
##
## Call:
## lm(formula = wine$quality ~ wine$volatile.acidity, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.79071 -0.54411 -0.00687 0.47350 2.93148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.56575 0.05791 113.39 <2e-16 ***
## wine$volatile.acidity -1.76144 0.10389 -16.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7437 on 1597 degrees of freedom
## Multiple R-squared: 0.1525, Adjusted R-squared: 0.152
## F-statistic: 287.4 on 1 and 1597 DF, p-value: < 2.2e-16
# s20x equality-of-variance diagnostic plot for the volatile-acidity model,
# followed by a repeat of the model summary for reference.
eovcheck(wine.fit)
summary(wine.fit)
##
## Call:
## lm(formula = wine$quality ~ wine$volatile.acidity, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.79071 -0.54411 -0.00687 0.47350 2.93148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.56575 0.05791 113.39 <2e-16 ***
## wine$volatile.acidity -1.76144 0.10389 -16.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7437 on 1597 degrees of freedom
## Multiple R-squared: 0.1525, Adjusted R-squared: 0.152
## F-statistic: 287.4 on 1 and 1597 DF, p-value: < 2.2e-16
# Scatter plot of quality against volatile acidity with the fitted
# regression line overlaid, then confidence intervals for the coefficients.
# The x axis is labelled with the predictor actually plotted
# (volatile acidity), not alcohol.
plot(wine$quality ~ wine$volatile.acidity,
     main = "Fitted model", xlab = "volatile acidity", ylab = "quality",
     data = wine)
abline(wine.fit)
ciReg(wine.fit)
## 95 % C.I.lower 95 % C.I.upper
## (Intercept) 6.45217 6.67932
## wine$volatile.acidity -1.96522 -1.55765
# Residuals-vs-fitted diagnostic plot for the volatile-acidity model.
plot(wine.fit,which=1)
# s20x normality check of the residuals, with the Shapiro-Wilk test enabled.
normcheck(wine.fit, shapiro.wilk = TRUE)
# s20x Cook's distance plot (used to look for influential observations).
cooks20x(wine.fit)
# Fit one simple linear regression of quality per physicochemical predictor
# so their summaries (and R-squared values) can be compared side by side.
# Note: wine.va is the same model as wine.fit above.
wine.ph      <- lm(wine$quality ~ wine$pH, data = wine)
wine.tsd     <- lm(wine$quality ~ wine$total.sulfur.dioxide, data = wine)
wine.alcohol <- lm(wine$quality ~ wine$alcohol, data = wine)
wine.va      <- lm(wine$quality ~ wine$volatile.acidity, data = wine)
wine.fsd     <- lm(wine$quality ~ wine$free.sulfur.dioxide, data = wine)
wine.fa      <- lm(wine$quality ~ wine$fixed.acidity, data = wine)
wine.rs      <- lm(wine$quality ~ wine$residual.sugar, data = wine)
wine.chl     <- lm(wine$quality ~ wine$chlorides, data = wine)
wine.dens    <- lm(wine$quality ~ wine$density, data = wine)
wine.ca      <- lm(wine$quality ~ wine$citric.acid, data = wine)

# Model summary: quality regressed on pH.
summary(wine.ph)
##
## Call:
## lm(formula = wine$quality ~ wine$pH, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6817 -0.6394 0.3032 0.3878 2.4874
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.6359 0.4332 15.320 <2e-16 ***
## wine$pH -0.3020 0.1307 -2.311 0.021 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8065 on 1597 degrees of freedom
## Multiple R-squared: 0.003333, Adjusted R-squared: 0.002709
## F-statistic: 5.34 on 1 and 1597 DF, p-value: 0.02096
# Model summary: quality regressed on total sulfur dioxide.
summary(wine.tsd)
##
## Call:
## lm(formula = wine$quality ~ wine$total.sulfur.dioxide, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8063 -0.6336 0.2164 0.3800 2.5527
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.8471792 0.0343670 170.140 < 2e-16 ***
## wine$total.sulfur.dioxide -0.0045442 0.0006037 -7.527 8.62e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7939 on 1597 degrees of freedom
## Multiple R-squared: 0.03426, Adjusted R-squared: 0.03366
## F-statistic: 56.66 on 1 and 1597 DF, p-value: 8.622e-14
# Model summary: quality regressed on alcohol.
summary(wine.alcohol)
##
## Call:
## lm(formula = wine$quality ~ wine$alcohol, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8442 -0.4112 -0.1690 0.5166 2.5888
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.87497 0.17471 10.73 <2e-16 ***
## wine$alcohol 0.36084 0.01668 21.64 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7104 on 1597 degrees of freedom
## Multiple R-squared: 0.2267, Adjusted R-squared: 0.2263
## F-statistic: 468.3 on 1 and 1597 DF, p-value: < 2.2e-16
# Model summary: quality regressed on volatile acidity.
summary(wine.va)
##
## Call:
## lm(formula = wine$quality ~ wine$volatile.acidity, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.79071 -0.54411 -0.00687 0.47350 2.93148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.56575 0.05791 113.39 <2e-16 ***
## wine$volatile.acidity -1.76144 0.10389 -16.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7437 on 1597 degrees of freedom
## Multiple R-squared: 0.1525, Adjusted R-squared: 0.152
## F-statistic: 287.4 on 1 and 1597 DF, p-value: < 2.2e-16
# Model summary: quality regressed on free sulfur dioxide.
summary(wine.fsd)
##
## Call:
## lm(formula = wine$quality ~ wine$free.sulfur.dioxide, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6864 -0.6394 0.3215 0.3762 2.4661
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.698107 0.036678 155.357 <2e-16 ***
## wine$free.sulfur.dioxide -0.003911 0.001929 -2.027 0.0428 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8068 on 1597 degrees of freedom
## Multiple R-squared: 0.002566, Adjusted R-squared: 0.001941
## F-statistic: 4.109 on 1 and 1597 DF, p-value: 0.04283
# Model summary: quality regressed on fixed acidity.
summary(wine.fa)
##
## Call:
## lm(formula = wine$quality ~ wine$fixed.acidity, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8248 -0.6061 0.1925 0.4341 2.5550
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.15732 0.09789 52.684 < 2e-16 ***
## wine$fixed.acidity 0.05754 0.01152 4.996 6.5e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8016 on 1597 degrees of freedom
## Multiple R-squared: 0.01539, Adjusted R-squared: 0.01477
## F-statistic: 24.96 on 1 and 1597 DF, p-value: 6.496e-07
# Model summary: quality regressed on residual sugar.
summary(wine.rs)
##
## Call:
## lm(formula = wine$quality ~ wine$residual.sugar, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6609 -0.6334 0.3580 0.3690 2.3729
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.616055 0.041616 134.950 <2e-16 ***
## wine$residual.sugar 0.007865 0.014331 0.549 0.583
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8077 on 1597 degrees of freedom
## Multiple R-squared: 0.0001886, Adjusted R-squared: -0.0004375
## F-statistic: 0.3012 on 1 and 1597 DF, p-value: 0.5832
# Model summary: quality regressed on chlorides.
summary(wine.chl)
##
## Call:
## lm(formula = wine$quality ~ wine$chlorides, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6946 -0.6503 0.3010 0.3607 2.3607
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.82948 0.04229 137.852 < 2e-16 ***
## wine$chlorides -2.21184 0.42578 -5.195 2.31e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8011 on 1597 degrees of freedom
## Multiple R-squared: 0.01662, Adjusted R-squared: 0.016
## F-statistic: 26.99 on 1 and 1597 DF, p-value: 2.313e-07
# Model summary: quality regressed on density.
summary(wine.dens)
##
## Call:
## lm(formula = wine$quality ~ wine$density, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7885 -0.6216 0.1554 0.4271 2.5177
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 80.24 10.51 7.636 3.83e-14 ***
## wine$density -74.85 10.54 -7.100 1.87e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7954 on 1597 degrees of freedom
## Multiple R-squared: 0.0306, Adjusted R-squared: 0.02999
## F-statistic: 50.41 on 1 and 1597 DF, p-value: 1.875e-12
# Model summary: quality regressed on citric acid.
summary(wine.ca)
##
## Call:
## lm(formula = wine$quality ~ wine$citric.acid, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0011 -0.5976 0.1021 0.5057 2.5901
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.38172 0.03372 159.610 <2e-16 ***
## wine$citric.acid 0.93845 0.10104 9.288 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7869 on 1597 degrees of freedom
## Multiple R-squared: 0.05124, Adjusted R-squared: 0.05065
## F-statistic: 86.26 on 1 and 1597 DF, p-value: < 2.2e-16
library(d3scatter)
library(crosstalk)
library(leaflet)
library(DT)
# Wrap the wine data in a crosstalk SharedData object so the filter widgets
# and both scatter plots operate on the same linked data.
shared_wine <- SharedData$new(wine[])

# Three-column layout: filter controls, then sulphates-vs-quality scatter
# plots coloured by pH and by quality respectively.
bscols(
  widths = c(5, NA, NA),
  list(
    filter_checkbox(
      "quality", "Quality", shared_wine, ~quality,
      inline = TRUE
    ),
    filter_slider(
      "sulphates", "Sulphates", shared_wine, ~sulphates,
      width = "100%"
    )
  ),
  d3scatter(
    shared_wine, ~sulphates, ~quality, ~factor(pH),
    width = "100%", height = 250
  ),
  d3scatter(
    shared_wine, ~sulphates, ~quality, ~factor(quality),
    width = "100%", height = 250
  )
)
Wine is one of humanity's oldest alcoholic beverages. It has been used recreationally, religiously, and medicinally throughout the millennia. Wine was seen as a gift to the people from the Greek god Dionysos (or Bacchus, if you were Roman). It was said that the god gifted the vine to a man named Ikarios. Ikarios used this gift to make wine, which he was inclined to share with his fellow people. When he shared the wine, the others were frightened by its effects, thinking that they had been poisoned. They ended up murdering Ikarios, and his creation spread from there.
Although there are still religious rituals that use wine, such as the Christian ritual known as “communion” or the “Eucharist”, it is used far more frequently for recreational purposes. Wine makes up roughly 11.4% of all alcohol sold in America, which comes out to ~ 3.8 billion liters of the fruity beverage.
With that much wine being sold in America alone, companies have plenty of motivation to produce quality wines for people to enjoy. When it comes to the quality of a wine, much is subjective. How do you quantify quality of taste? The wine must be tasted by humans in order to determine the quality. With such subjective testing, it's difficult to find which properties of the wine people like. In order to try to find a trend in quality, various physicochemical properties have been recorded along with their coinciding quality rating in this dataset.
I am a college student. Alcohol is well ingrained into the college culture. As a man who is part of said culture, I have had my fair share of alcoholic beverages. The more that my palate has adjusted, the more I have become appreciative of wine. As a huge fan of juice growing up, it just makes sense that I would make the gradual transition over to wine. It has quickly become one of my go-to drinks when it is available. With my growing interest in this drink, I have often wondered why some wines make me fall in love, whereas other wines make me fall over. This analysis is an attempt to seek out and identify the factors that contribute to this overall quality.
This data is pulled from a 2009 study in which researchers at the University of Minho in Portugal attempted to predict taste preferences using physicochemical properties of red wine. The dataset contains 1599 samples with 11 different chemical properties and the quality rating for each sample. The properties measured are the fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, and alcohol. The quality is based on a scale of 0 (worst) to 10 (best).
# Read the red-wine data set and preview its first rows.
wine <- read.csv("winequality-red.csv")
head(wine)
I will analyze the data in an attempt to find a correlation between the amounts of compounds in a wine and its overall quality. I do not expect all compounds to make a statistically significant difference in quality, but I am hoping to find at least a few. The results of this analysis will help me and anyone else who reads this make better guesses as to the quality of wine based on the compounds it contains.
There are a lot of factors that could play a role in the quality of the wine. I will show graphs of the wine quality given each compound.
##
## Shapiro-Wilk normality test
##
## data: wine.stder
## W = 0.92457, p-value < 2.2e-16
## `geom_smooth()` using formula 'y ~ x'
##
## Shapiro-Wilk normality test
##
## data: wine.stder
## W = 0.98876, p-value = 8.727e-10
## `geom_smooth()` using formula 'y ~ x'
##
## Shapiro-Wilk normality test
##
## data: wine.stder
## W = 0.96836, p-value < 2.2e-16
## `geom_smooth()` using formula 'y ~ x'
##
## Shapiro-Wilk normality test
##
## data: wine.stder
## W = 0.86397, p-value < 2.2e-16
## `geom_smooth()` using formula 'y ~ x'
##
## Shapiro-Wilk normality test
##
## data: wine.stder
## W = 0.89555, p-value < 2.2e-16
## `geom_smooth()` using formula 'y ~ x'
##
## Shapiro-Wilk normality test
##
## data: wine.stder
## W = 0.88769, p-value < 2.2e-16
## `geom_smooth()` using formula 'y ~ x'
##
## Shapiro-Wilk normality test
##
## data: wine.stder
## W = 0.94436, p-value < 2.2e-16
## `geom_smooth()` using formula 'y ~ x'
##
## Shapiro-Wilk normality test
##
## data: wine.stder
## W = 0.94445, p-value < 2.2e-16
## `geom_smooth()` using formula 'y ~ x'
##
## Shapiro-Wilk normality test
##
## data: wine.stder
## W = 0.89281, p-value < 2.2e-16
## `geom_smooth()` using formula 'y ~ x'
##
## Shapiro-Wilk normality test
##
## data: wine.stder
## W = 0.95736, p-value < 2.2e-16
## `geom_smooth()` using formula 'y ~ x'
##
## Shapiro-Wilk normality test
##
## data: wine.stder
## W = 0.97576, p-value = 8.737e-16
## `geom_smooth()` using formula 'y ~ x'
Testing1
library(ggplot2)
# Test plot on mtcars: mpg against displacement, coloured by cylinder count,
# with a loess smoother layered on top. Built as one chained expression and
# then printed.
g <- ggplot(mtcars, aes(x = disp, y = mpg, color = cyl)) +
  geom_point() +
  geom_smooth(method = "loess")
g
## `geom_smooth()` using formula 'y ~ x'
Graph of data with loess smoother
As you can see in the preliminary graphs, the data is mostly distributed normally, but there isn’t much of a linear trend to analyze. I believe that the quality would drop if too much of any one compound is in the wine. That being said, I will still attempt to find some linear correlation within the data, even if the relationship is minimal. The histograms show that the values of the independent variables are unevenly distributed across their ranges. This can cause unexpected and spurious trends in the data. Even though this may skew some results, I will set those concerns aside in order to produce a regression analysis. I will build a probabilistic model in the form of a simple linear regression. The model assumes that the mean value of \(y\) for any given value of \(x\) lies on a straight line. Any deviations of the points from that line are represented by \(\epsilon\). The equation of the model is: \[y_i=\beta_0+\beta_1x_i+\epsilon_i\] The terms \(\beta_0\), \(\beta_1\), and \(\epsilon_i\) are as follows: \(\beta_0\) and \(\beta_1\) are unknown parameters, and \(\epsilon_i\) is the random error. If you drop \(\epsilon_i\), you get the equation for the mean of \(y\) given any value of \(x\). This can be shown as follows: \[E(y_i)=E(\beta_0+\beta_1x_i+\epsilon_i)\] \[=\beta_0+\beta_1x_i+E(\epsilon_i)\] \[=\beta_0+\beta_1x_i\] Given this information, we can write the mean as \(E(Y|x)\), where \(\beta_0\) is the y-intercept and \(\beta_1\) is the slope. According to Mendenhall and Sincich (2016), we need to make the following assumptions about \(\epsilon\) in order to estimate the \(\beta\) parameters. These assumptions are: (1) the mean of \(\epsilon\) is 0; (2) the variance of \(\epsilon\) is constant for all values of \(x\); (3) \(\epsilon\) is normally distributed; and (4) the errors for different observations are independent.
Given these assumptions, we should be able to produce some estimates for the \(\beta\) parameters.
Since the data isn’t very linear and there are a lot of independent variables, I will be selecting 2 independent variables that show the best fit based on their multiple \(R^2\) value. The 2 variables with the highest multiple \(R^2\) values are alcohol and volatile acidity.
# Model summary: quality regressed on alcohol (highest R-squared of the fits).
summary(wine.alc)
##
## Call:
## lm(formula = wine$quality ~ wine$alcohol, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8442 -0.4112 -0.1690 0.5166 2.5888
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.87497 0.17471 10.73 <2e-16 ***
## wine$alcohol 0.36084 0.01668 21.64 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7104 on 1597 degrees of freedom
## Multiple R-squared: 0.2267, Adjusted R-squared: 0.2263
## F-statistic: 468.3 on 1 and 1597 DF, p-value: < 2.2e-16
# Model summary: quality regressed on volatile acidity (second-highest
# R-squared of the fits).
summary(wine.va)
##
## Call:
## lm(formula = wine$quality ~ wine$volatile.acidity, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.79071 -0.54411 -0.00687 0.47350 2.93148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.56575 0.05791 113.39 <2e-16 ***
## wine$volatile.acidity -1.76144 0.10389 -16.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7437 on 1597 degrees of freedom
## Multiple R-squared: 0.1525, Adjusted R-squared: 0.152
## F-statistic: 287.4 on 1 and 1597 DF, p-value: < 2.2e-16
These models have multiple \(R^2\) values of 0.2267 and 0.1525, respectively. While very low, they are the best I have to work with. Based on these summaries, we can say that \[\hat{\beta}_{0a}=1.87497\] \[\hat{\beta}_{1a}=0.36084\] and \[\hat{\beta}_{0v}=6.56575\] \[\hat{\beta}_{1v}=-1.76144\]
# 95% confidence intervals for the intercept and slope of the alcohol model.
ciReg(wine.alc,conf.level=0.95,print.out=TRUE)
## 95 % C.I.lower 95 % C.I.upper
## (Intercept) 1.53229 2.21766
## wine$alcohol 0.32813 0.39355
# 95% confidence intervals for the intercept and slope of the
# volatile-acidity model.
ciReg(wine.va,conf.level=0.95,print.out=TRUE)
## 95 % C.I.lower 95 % C.I.upper
## (Intercept) 6.45217 6.67932
## wine$volatile.acidity -1.96522 -1.55765
\[\hat{y}_a=\hat{\beta}_{0a}+\hat{\beta}_{1a}x_i=1.87497+0.36084x_i\] \[\hat{y}_v=\hat{\beta}_{0v}+\hat{\beta}_{1v}x_i=6.56575-1.76144x_i\] The slope \(\hat{\beta}_{1a}=0.36084\) tells us that the predicted quality rises by 0.36084 for every 1-unit increase in alcohol. The slope \(\hat{\beta}_{1v}=-1.76144\) tells us that the predicted quality falls by 1.76144 for every 1-unit increase in volatile acidity.
I will attempt to verify the above assumptions to show whether or not a straight line is the best fit for the model.
# Quality against alcohol, with the fitted regression line overlaid.
plot(wine$quality ~ wine$alcohol,
     bg = "Purple", pch = 21, cex = 1.2, data = wine)
abline(wine.alc)

# Quality against volatile acidity, with the fitted regression line overlaid.
plot(wine$quality ~ wine$volatile.acidity,
     bg = "Blue", pch = 21, cex = 1.2, data = wine)
abline(wine.va)
These plots do not indicate that a linear model is the best fit for the data.
The residuals tell us by how much points deviate from the fitted line. They help us to see just how far the points are varying from the line.
# Residual plots: for each model, draw a vertical segment from every observed
# quality value to its fitted value, then overlay the regression line.
#
# Because the model terms are written as `wine$alcohol` /
# `wine$volatile.acidity`, predict() cannot match a `newdata` column named
# `alcohol` / `volatile.acidity`, so any newdata would be ignored. The fitted
# values of the training rows are what is needed here, so fitted() is used
# directly.

# Alcohol model.
plot(wine$quality ~ wine$alcohol,
     bg = "Green", pch = 21, cex = 1.2, data = wine)
yhat <- fitted(wine.alc)
with(wine, segments(alcohol, quality, alcohol, yhat))
abline(wine.alc)

# Volatile-acidity model.
plot(wine$quality ~ wine$volatile.acidity,
     bg = "Maroon", pch = 21, cex = 1.2, data = wine)
yhat <- fitted(wine.va)
with(wine, segments(volatile.acidity, quality, volatile.acidity, yhat))
abline(wine.va)
As you can see, there is quite a lot of deviation from the line.
\[\epsilon_i \sim N(0,\sigma^2)\]
\[R_{adj}^2 = 1-\frac{n-1}{n-(k+1)}\left(1-R^2\right)\]
predict()ciReg()Remember to interpret this plot and all other plots
A little footnote↩